建立随机模型,replace=T表示有放回,prob是概率比例。

population.values <- 1:3
probs <- c(5,2,3)
my.draw <- sample (population.values, 100000, probs, replace=TRUE)
table(my.draw)
## my.draw
##     1     2     3 
## 50080 20020 29900

replicate生成多组数据,下面是无放回的,所以相当于随机排序。

sample(1:6)
## [1] 6 2 4 1 5 3
replicate(3,sample(c("Curly","Larry","Moe","Shemp")))
##      [,1]    [,2]    [,3]   
## [1,] "Moe"   "Shemp" "Shemp"
## [2,] "Curly" "Moe"   "Curly"
## [3,] "Shemp" "Curly" "Larry"
## [4,] "Larry" "Larry" "Moe"

bootstrp.resample来生成重抽样(replace=T)

bootstrap.resample <- function (object){
  sample (object, length(object), replace=TRUE)}
replicate(5, bootstrap.resample (6:10))
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    9   10    9    7    9
## [2,]    7    8    7    9    9
## [3,]    7    6    8    7    8
## [4,]    8    8    9   10    7
## [5,]    6    7   10    8   10

对cat数据集进行重抽样看均值差异。每次运行结果会不一样。

diff.in.means <- function(df) {
  mean(df[df$Sex=="M","Hwt"]) - mean(df[df$Sex=="F","Hwt"])
}
resample.diffs <- replicate(1000, 
  diff.in.means(cats[bootstrap.resample(1:nrow(cats)),]))

得到的图像基本是正态的。

tibble(d = resample.diffs) %>% ggplot(mapping = aes(x = d))+
  geom_histogram(bins = 25)+
  geom_vline(aes(xintercept = diff.in.means(cats)), col='red')

quantile(resample.diffs,prob=c(0.025,0.975))
##     2.5%    97.5% 
## 1.488902 2.733772
qnorm(c(0.025,0.975),mean=mean(resample.diffs),
      sd=sd(resample.diffs))
## [1] 1.461083 2.762555

#Markov链 天气与前一天的晴雨有关,但与再之前的无关,是Markov。

sunny.year <- rep(NA, 365)
sunny.year[1] <- 1
for (day in 2:365) {
  sunny.year[day] <- rbinom(1,1,0.2 + 0.6*sunny.year[day-1])}
tibble(x = 1:365, y = sunny.year) %>% ggplot(aes(x = x, y = y))+
  geom_line()+
  labs(title="Sunny Days in An Equatorial City", x="Day", y="Sunshine?")

若天气与之前无关,即晴雨概率相等,则有下图,明显不同。

boring.year <- tibble(day = 1:365, rain = rbinom(365, 1, 0.5))
boring.year %>% ggplot(aes(x = day, y = rain))+ 
  geom_line()+
  labs(title="Sunny Days in A Coin Flip City", x="Day", ylab="Sunshine?")

奇奇怪怪年,下一天和上一天相反概率大。

weird.year <- rep(NA, 365)
weird.year[1] <- 1
transition.matrix <- matrix (c(0.2, 0.8, 0.8, 0.2), nrow=2)
for (day in 2:365) weird.year[day] <- 
  sample(1:2, 1, prob=transition.matrix[weird.year[day-1],])
tibble(day = 1:365, rain = weird.year) %>% 
ggplot(aes(x = day, y = rain))+
  geom_line()+
  labs(title="Sunny Days in Al Gore's Nightmare", x="Day", y="Sunshine?")

##标准Markov函数的形式,transition.matrix是转移矩阵。

rmarkovchain <- function (nn, transition.matrix, 
    start=sample(1:nrow(transition.matrix), 1)) {
  output <- rep (NA, nn)
  output[1] <- start
  for (day in 2:nn) {
    output[day] <- 
    sample(ncol(transition.matrix), 1, 
           prob=transition.matrix[output[day-1],])}
}

一个简单的Markov链:随机游走(output记录醉汉的位置)

randomwalk <- function (nn, upprob=0.5, start=50) {
  output <- rep (NA, nn)
  output[1] <- start
  for (iteration in 2:nn) 
    output[iteration] <- 
      output[iteration-1] + 2*rbinom(1, 1, upprob) - 1
  output  
}
tibble(x = 1:10000, y = randomwalk(10000, start=200)) %>% 
  ggplot(aes(x, y))+ geom_line()+
  labs(title="Simple Random Walk")

用随机变量计算积分,由

\[\frac{1}{n}\sum_{i=1}^{n}{f(X_i)} \rightarrow \mathbb{E}_{p}[f(X)] = \int{f(x) p(x) dx}\]

若要求\(\int g(x)dx\),则取\(f(x)=g(x)/p(x)\)

x1 <- runif(300, 0, 1); y1 <- runif(300, 0, 2.6); 
selected <- y1 < dbeta(x1, 3, 6)

选出来落在dbeta之内的即可估算面积。

利用选出的点进行分布计算。

mean(selected)
## [1] 0.3266667
accepted.points <- x1[selected]
mean(accepted.points < 0.5)
## [1] 0.877551
pbeta(0.5, 3, 6)
## [1] 0.8554688

Metropolis Algorithm

metropolis.one <- function (xx, fun=dpois, ...) {
  prop <- 2*rbinom(1,1,0.5) - 1 + xx#随机游动
  acc.ratio <- fun(prop, ...)/fun(xx, ...)#fun是个函数
  output <- if (acc.ratio > runif(1)) prop else xx
  output
}
replicate (10, metropolis.one(10, lambda=5))
##  [1] 10 10  9 10  9  9  9 10 11  9
start <- 50
draws <- rep(NA, 10000)
for (iteration in 1:10000) {
  start <- metropolis.one (start, lambda=10)
  draws[iteration] <- start
}
tibble(n = 1:10000, draws = draws) %>% 
  ggplot(aes(n, draws))+ geom_point()